Anomaly Detection Techniques in Python

Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
from IPython.display import HTML 
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from matplotlib import cm
from sklearn.ensemble import IsolationForest
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 
import plotly as plty
import plotly.graph_objs as go
/home/mishraanurag218/anaconda3/lib/python3.8/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
  warnings.warn(message, FutureWarning)

Loading Dataset

In [2]:
def fileRead(directory_path):
    """Recursively collect the names of all files under ``directory_path``.

    Returns a flat list of bare file names (no directory components);
    subdirectories are traversed via ``os.walk``.
    """
    collected = []
    for _root, _dirs, files in os.walk(directory_path):
        collected.extend(files)
    return collected


    
    
# NOTE(review): absolute, machine-specific paths — a configurable DATA_DIR
# would let the notebook run on other machines.
directoryPath1 = "/home/mishraanurag218/Anurag/Projects/Untitled Folder/data/s1/"
directoryPath2 = "/home/mishraanurag218/Anurag/Projects/Untitled Folder/data/s2/"
# File-name lists for the two sensor folders.
s1 = fileRead(directoryPath1)
#print(s1)
s2 = fileRead(directoryPath2)
#print(s2)
#print(s2)
In [3]:
# Column names for the headerless per-subject CSV files.
cols = ['time','acc_frontal','acc_vertical','acc_lateral','id','rssi','phase','frequency','activity']
def folder_to_csv(directory_path, file_name, col_name):
    """Read every listed CSV file from a folder and stack them into one DataFrame.

    Parameters
    ----------
    directory_path : str
        Directory containing the files.
    file_name : list of str
        File names to read; each name encodes the device id (all characters
        but the last) and the subject's sex (last character), e.g. "d1p28F".
    col_name : list of str
        Column names applied to each headerless CSV.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index; empty frame if
        ``file_name`` is empty.
    """
    frames = []
    for f_n in file_name:
        # os.path.join tolerates paths with or without a trailing separator.
        df = pd.read_csv(os.path.join(directory_path, f_n), names=col_name)
        df['device_id'] = f_n[:-1]  # e.g. "d1p28F" -> "d1p28"
        df['sex'] = f_n[-1]         # e.g. "d1p28F" -> "F"
        frames.append(df)
    if not frames:
        # Match the original behaviour for an empty file list.
        return pd.DataFrame()
    # Single concat at the end avoids the quadratic cost of concatenating
    # inside the loop.
    return pd.concat(frames, ignore_index=True)

# Load both folders and stack them into a single frame of sensor readings.
df_s1 = folder_to_csv(directoryPath1,s1,cols)
df_s2 = folder_to_csv(directoryPath2,s2,cols)

df = pd.concat([df_s1, df_s2], ignore_index=True)

df.head(5)
Out[3]:
time acc_frontal acc_vertical acc_lateral id rssi phase frequency activity device_id sex
0 0.00 0.21 1.02 0.01 1 -55.50 2.70 925.75 1 d1p28 F
1 0.50 0.21 1.02 0.01 3 -66.50 0.64 922.75 1 d1p28 F
2 1.00 0.33 0.96 0.08 1 -55.50 3.85 924.75 1 d1p28 F
3 1.25 0.33 0.96 0.08 1 -57.50 4.72 923.75 1 d1p28 F
4 1.50 0.33 0.96 0.08 4 -55.00 5.99 924.25 1 d1p28 F

Data Pre-processing

Changing the sex into binary

In [4]:
def categorical_to_binary(x):
    """Encode a sex label as an integer: 'F' -> 0, anything else -> 1."""
    return 0 if x == 'F' else 1

# Encode the sex label as 0/1.  (A vectorised alternative would be
# (df['sex'] != 'F').astype(int).)
df['sex_b'] = df['sex'].apply(categorical_to_binary)
df.head(5)
Out[4]:
time acc_frontal acc_vertical acc_lateral id rssi phase frequency activity device_id sex sex_b
0 0.00 0.21 1.02 0.01 1 -55.50 2.70 925.75 1 d1p28 F 0
1 0.50 0.21 1.02 0.01 3 -66.50 0.64 922.75 1 d1p28 F 0
2 1.00 0.33 0.96 0.08 1 -55.50 3.85 924.75 1 d1p28 F 0
3 1.25 0.33 0.96 0.08 1 -57.50 4.72 923.75 1 d1p28 F 0
4 1.50 0.33 0.96 0.08 4 -55.00 5.99 924.25 1 d1p28 F 0
In [5]:
df  # full-frame display; pandas truncates to head/tail rows
Out[5]:
time acc_frontal acc_vertical acc_lateral id rssi phase frequency activity device_id sex sex_b
0 0.00 0.21 1.02 0.01 1 -55.50 2.70 925.75 1 d1p28 F 0
1 0.50 0.21 1.02 0.01 3 -66.50 0.64 922.75 1 d1p28 F 0
2 1.00 0.33 0.96 0.08 1 -55.50 3.85 924.75 1 d1p28 F 0
3 1.25 0.33 0.96 0.08 1 -57.50 4.72 923.75 1 d1p28 F 0
4 1.50 0.33 0.96 0.08 4 -55.00 5.99 924.25 1 d1p28 F 0
... ... ... ... ... ... ... ... ... ... ... ... ...
75123 753.75 0.23 0.99 0.03 1 -49.00 5.80 921.75 4 d2p15 F 0
75124 753.83 0.23 0.99 0.03 1 -49.00 5.71 921.75 4 d2p15 F 0
75125 753.88 0.23 0.99 0.03 1 -49.00 5.40 921.75 4 d2p15 F 0
75126 754.00 0.34 0.94 0.16 1 -51.50 6.05 924.25 4 d2p15 F 0
75127 755.25 0.25 0.99 0.02 1 -59.00 1.57 922.25 2 d2p15 F 0

75128 rows × 12 columns

Removing non-numerical attributes, categorical values, and IDs, since machine learning algorithms only work on numerical values.

In [6]:
# Feature matrix: numeric sensor columns only — identifiers, the raw label,
# the encoded label and the time index are excluded.  Target: binary sex.
# (DataFrame.drop already returns a new frame, so no explicit copy is needed.)
dfX = df.drop(columns=['sex', 'device_id', 'id', 'sex_b', 'time'])
dfY = df['sex_b'].copy()
dfX.head()
Out[6]:
acc_frontal acc_vertical acc_lateral rssi phase frequency activity
0 0.21 1.02 0.01 -55.50 2.70 925.75 1
1 0.21 1.02 0.01 -66.50 0.64 922.75 1
2 0.33 0.96 0.08 -55.50 3.85 924.75 1
3 0.33 0.96 0.08 -57.50 4.72 923.75 1
4 0.33 0.96 0.08 -55.00 5.99 924.25 1

Scaling the dataset to a standard range

In [7]:
# Rescale every feature to [0, 1] so distance-based methods (DBSCAN, LOF)
# are not dominated by large-magnitude columns such as frequency.
scaler = MinMaxScaler()
dfX = pd.DataFrame(scaler.fit_transform(dfX), columns=dfX.columns)
dfX.head(5)
Out[7]:
acc_frontal acc_vertical acc_lateral rssi phase frequency activity
0 0.43 0.61 0.53 0.49 0.43 1.00 0.00
1 0.43 0.61 0.53 0.16 0.10 0.45 0.00
2 0.48 0.59 0.55 0.49 0.61 0.82 0.00
3 0.48 0.59 0.55 0.43 0.75 0.64 0.00
4 0.48 0.59 0.55 0.51 0.95 0.73 0.00

Univariate Analysis on dataset

Function for Histogram plot

In [8]:
def hist_plot(x, y):
    """Draw a histogram with a KDE overlay for each column name in ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Source data.
    y : iterable of str
        Column names to plot, one figure per column.
    """
    for col in y:
        # sns.distplot is deprecated (removed in seaborn 0.12); histplot with
        # kde=True on a density scale is the supported replacement.
        sns.histplot(x[col], bins=150, kde=True, stat="density")
        plt.show()
    
In [9]:
# Univariate distributions of the continuous sensor channels.
cols = ['time','acc_frontal','acc_vertical','acc_lateral','rssi','phase']
hist_plot(df,cols)

Function for joint plot

In [10]:
def joint_plot(x, y, z):
    """Render a seaborn jointplot of each column in ``y`` against column ``z``."""
    for col in y:
        sns.jointplot(x=col, y=z, data=x)
        plt.show()
In [11]:
# Each sensor channel plotted jointly against time.
cols = ['acc_frontal','acc_vertical','acc_lateral','rssi','phase']
joint_plot(df,cols,'time')

Pair Plot

In [12]:
sns.set_style("whitegrid");
# Pairwise scatter matrix coloured by sex; height=3 keeps the grid compact.
sns.pairplot(df, hue="sex_b", height=3);
plt.show()

lmplot

In [13]:
def implot(x, y, z):
    """Draw a seaborn lmplot for every ordered pair of distinct columns in
    ``y``, coloured and faceted by the categorical column ``z``."""
    for x_col in y:
        for y_col in y:
            if x_col == y_col:
                continue
            sns.lmplot(x=x_col, y=y_col, data=x, hue=z, col=z)
            plt.show()
                
# Pairwise regression plots split by sex.
implot(df,['acc_frontal','acc_vertical','acc_lateral','rssi','phase'],'sex_b')
In [14]:
# Same regression plots, now split by activity class.
implot(df,['acc_frontal','acc_vertical','acc_lateral','rssi','phase'],'activity')

countplot

In [15]:
def countPlot(x, y):
    """Show a count plot for each column in ``y``, then a grouped count plot
    of ``y[0]`` broken down by ``y[1]``."""
    for col in y:
        sns.countplot(x=col, data=x)
        plt.show()
    sns.countplot(x=y[0], hue=y[1], data=x)
    plt.show()
        
# Class balance of the label and of activity, plus label-by-activity counts.
countPlot(df,['sex_b','activity'])

Balancing the data

In [16]:
# Oversample the minority class so both sexes are equally represented.
sm = SMOTE(random_state=2)
# fit_sample was deprecated in imbalanced-learn 0.4 and later removed;
# fit_resample is the supported name.  SMOTE accepts a pandas Series
# directly, so the .ravel() call is unnecessary.
df_X, df_Y = sm.fit_resample(dfX, dfY)
df_Y = pd.DataFrame(df_Y, columns=['sex_b'])
In [17]:
# Verify SMOTE balanced the classes (both bars should now be equal).
sns.countplot(x ='sex_b', data = df_Y)
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0e3e65c1c0>

Lazy predict classification

In [18]:
# Baseline: benchmark ~30 classifiers on the SMOTE-balanced, unfiltered data.
# NOTE(review): this split/fit pattern is repeated three times in the
# notebook — a small helper function would avoid the copy-paste.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y,test_size=.33,random_state =123)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models
 90%|█████████ | 27/30 [03:33<00:57, 19.07s/it]
[14:42:05] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
100%|██████████| 30/30 [03:35<00:00,  7.19s/it]
Out[18]:
Accuracy Balanced Accuracy ROC AUC F1 Score Time Taken
Model
RandomForestClassifier 0.98 0.98 0.98 0.98 6.19
ExtraTreesClassifier 0.98 0.98 0.98 0.98 3.90
BaggingClassifier 0.98 0.98 0.98 0.98 1.79
DecisionTreeClassifier 0.97 0.97 0.97 0.97 0.33
XGBClassifier 0.97 0.97 0.97 0.97 1.46
KNeighborsClassifier 0.96 0.96 0.96 0.96 1.88
ExtraTreeClassifier 0.95 0.95 0.95 0.95 0.11
LGBMClassifier 0.95 0.95 0.95 0.95 0.37
SVC 0.92 0.92 0.92 0.92 54.60
AdaBoostClassifier 0.87 0.87 0.87 0.87 1.96
NuSVC 0.85 0.85 0.85 0.85 121.69
LogisticRegression 0.74 0.74 0.74 0.74 0.14
CalibratedClassifierCV 0.73 0.73 0.73 0.72 15.57
SGDClassifier 0.72 0.72 0.72 0.70 0.19
LinearDiscriminantAnalysis 0.72 0.71 0.71 0.70 0.13
RidgeClassifier 0.72 0.71 0.71 0.70 0.10
RidgeClassifierCV 0.72 0.71 0.71 0.70 0.09
LinearSVC 0.71 0.71 0.71 0.70 4.25
QuadraticDiscriminantAnalysis 0.71 0.71 0.71 0.70 0.08
GaussianNB 0.70 0.70 0.70 0.68 0.09
BernoulliNB 0.68 0.68 0.68 0.67 0.11
NearestCentroid 0.68 0.68 0.68 0.67 0.08
Perceptron 0.52 0.52 0.52 0.51 0.11
CheckingClassifier 0.50 0.50 0.50 0.33 0.07
DummyClassifier 0.50 0.50 0.50 0.50 0.08
PassiveAggressiveClassifier 0.42 0.42 0.42 0.41 0.12

DBSCAN (Density-Based Spatial Clustering of Applications with Noise)

This is a clustering algorithm (an alternative to K-Means) that clusters points together and identifies any points not belonging to a cluster as outliers. It’s like K-means, except the number of clusters does not need to be specified in advance.

The method, step-by-step:

Randomly select a point not already assigned to a cluster or designated as an outlier. Determine if it’s a core point by seeing if there are at least min_samples points around it within epsilon distance.

Create a cluster of this core point and all points within epsilon distance of it (all directly reachable points).

Find all points that are within epsilon distance of each point in the cluster and add them to the cluster. Find all points that are within epsilon distance of all newly added points and add these to the cluster. Rinse and repeat. (i.e. perform “neighborhood jumps” to find all density-reachable points and add them to the cluster).

Sklearn Implementation of DBSCAN:

In [19]:
# Density-based clustering on the scaled features; points assigned to no
# cluster are labelled -1 and treated as outliers below.
outlier_detection = DBSCAN(
 eps = .2, 
 metric='euclidean', 
 min_samples = 5,
 n_jobs = -1)
clusters = outlier_detection.fit_predict(dfX)
# NOTE(review): cm.get_cmap is deprecated in recent matplotlib releases
# (matplotlib.colormaps['Set1'] is the modern accessor) — confirm the
# pinned matplotlib version before upgrading.
cmap = cm.get_cmap('Set1')

DBSCAN outputs a cluster label for each point, where -1 indicates an outlier (noise) and non-negative labels are cluster IDs. Below, I visualize the outputted outliers in red by plotting two variables.

In [20]:
# Scatter of time vs vertical acceleration, coloured by DBSCAN cluster label
# (-1 = outlier).
df.plot.scatter(x='time',y='acc_vertical', c=clusters, cmap=cmap,
 colorbar = False)
plt.show()
In [21]:
# NOTE(review): dead cell — superseded by the plotly.express version below;
# consider deleting it rather than keeping commented-out code.
# fig = go.Figure(data=go.Scatter(x=df['time'],
#                                 y=df['acc_vertical'],
#                                 mode='markers',
#                                 marker_color=clusters,
#                                 text=clusters)) # hover text goes here

# fig.update_layout(title='Scatter Plot to identify the outliers')
# fig.show()
In [22]:
# NOTE(review): import placed mid-notebook; moving it to the imports cell at
# the top would keep dependencies visible in one place.
import plotly.express as px
# Interactive view of all points coloured by cluster, then the outliers alone.
fig = px.scatter(df, x="time", y="acc_vertical", color=clusters,
                 hover_data=['time'])
fig.show()
fig = px.scatter(df[clusters==-1], x="time", y="acc_vertical",
                 hover_data=['time'])
fig.show()
In [23]:
# Positional indices of the points DBSCAN labelled as noise (-1).
outliers = np.where(clusters == -1)
# NOTE(review): `clusters` was fitted on dfX (the original scaled rows) while
# df_X/df_Y are the SMOTE-resampled frames.  Dropping by these index labels
# only lines up because SMOTE appends its synthetic rows after the originals —
# confirm this is the intended behaviour.
df_X_db = df_X.drop(list(outliers[0]))
df_Y_db = df_Y.drop(list(outliers[0]))
# The original bound this frame to both `df_dbScan` and a stray alias
# `result`; the unused alias has been removed.
df_dbScan = pd.concat([df_X_db, df_Y_db], axis=1, sort=False)
df_dbScan.to_csv(r'Filtered_DBSCAN.csv', index=False, header=True)
print(df_dbScan.head())
   acc_frontal  acc_vertical  acc_lateral  rssi  phase  frequency  activity  \
0         0.43          0.61         0.53  0.49   0.43       1.00      0.00   
1         0.43          0.61         0.53  0.16   0.10       0.45      0.00   
2         0.48          0.59         0.55  0.49   0.61       0.82      0.00   
3         0.48          0.59         0.55  0.43   0.75       0.64      0.00   
4         0.48          0.59         0.55  0.51   0.95       0.73      0.00   

   sex_b  
0      0  
1      0  
2      0  
3      0  
4      0  
In [24]:
# Class balance after removing the DBSCAN outliers (static + interactive).
sns.countplot(x ='sex_b', data = df_dbScan)

fig = px.histogram(df_dbScan, x="sex_b", color="sex_b")
fig.update_layout(barmode='group')
fig.show()
In [25]:
# Re-run the classifier benchmark on the DBSCAN-filtered data.
X_train, X_test, y_train, y_test = train_test_split(df_X_db, df_Y_db,test_size=.33,random_state =123)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models
 90%|█████████ | 27/30 [03:28<00:55, 18.65s/it]
[14:45:51] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
100%|██████████| 30/30 [03:30<00:00,  7.01s/it]
Out[25]:
Accuracy Balanced Accuracy ROC AUC F1 Score Time Taken
Model
ExtraTreesClassifier 0.98 0.98 0.98 0.98 3.81
RandomForestClassifier 0.98 0.98 0.98 0.98 6.11
BaggingClassifier 0.98 0.98 0.98 0.98 1.81
DecisionTreeClassifier 0.97 0.97 0.97 0.97 0.33
XGBClassifier 0.97 0.97 0.97 0.97 1.47
KNeighborsClassifier 0.96 0.96 0.96 0.96 1.81
ExtraTreeClassifier 0.96 0.96 0.96 0.96 0.10
LGBMClassifier 0.95 0.95 0.95 0.95 0.37
SVC 0.92 0.92 0.92 0.92 53.29
AdaBoostClassifier 0.87 0.87 0.87 0.87 1.97
NuSVC 0.85 0.85 0.85 0.85 120.29
LogisticRegression 0.74 0.74 0.74 0.74 0.13
CalibratedClassifierCV 0.73 0.73 0.73 0.72 12.92
QuadraticDiscriminantAnalysis 0.72 0.72 0.72 0.70 0.09
LinearSVC 0.71 0.71 0.71 0.70 4.48
LinearDiscriminantAnalysis 0.71 0.71 0.71 0.70 0.12
RidgeClassifier 0.71 0.71 0.71 0.70 0.09
RidgeClassifierCV 0.71 0.71 0.71 0.70 0.09
SGDClassifier 0.71 0.71 0.71 0.70 0.20
GaussianNB 0.70 0.70 0.70 0.68 0.09
PassiveAggressiveClassifier 0.69 0.69 0.69 0.69 0.12
NearestCentroid 0.68 0.68 0.68 0.67 0.08
BernoulliNB 0.68 0.68 0.68 0.67 0.11
Perceptron 0.68 0.68 0.68 0.66 0.13
CheckingClassifier 0.50 0.50 0.50 0.33 0.07
DummyClassifier 0.50 0.50 0.50 0.50 0.08

Isolation Forests

Randomly select a feature and randomly select a value for that feature within its range.

If the observation’s feature value falls above (below) the selected value, then this value becomes the new min (max) of that feature’s range.

Check if at least one other observation has values in the range of each feature in the dataset, where some ranges were altered via step 2. If no, then the observation is isolated.

Repeat steps 1–3 until the observation is isolated. The number of times you had to go through these steps is the isolation number. The lower the number, the more anomalous the observation is.

Sklearn Implementation of Isolation Forests:

In [26]:
# Isolation Forest on the scaled features; contamination=.1 asks the model
# to flag ~10% of points as anomalous.
rs=np.random.RandomState(0)
clf = IsolationForest(max_samples=100,random_state=rs, contamination=.1) 
clf.fit(dfX)
if_scores = clf.decision_function(dfX)
if_anomalies=clf.predict(dfX)
# Map the sklearn convention (-1 = anomaly, 1 = normal) to 1 = anomaly, 0 = normal.
if_anomalies=pd.Series(if_anomalies).replace([-1,1],[1,0])
In [27]:
# Interactive scatter coloured by anomaly flag, then the anomalies alone.
fig = px.scatter(dfX, x="phase", y="acc_vertical", color=if_anomalies,
                 hover_data=['phase'])
fig.show()
fig = px.scatter(dfX[if_anomalies==1], x="phase", y="acc_vertical",
                 hover_data=['phase'])
fig.show()
In [28]:
# Keep only the rows Isolation Forest did not flag as anomalous, then
# persist the filtered training data.
normal_mask = if_anomalies != 1
df_X_if = dfX[normal_mask]
df_Y_if = dfY[normal_mask]
df_lf = pd.concat([df_X_if, df_Y_if], axis=1, sort=False)
df_lf.to_csv(r'Filtered_lf.csv', index=False, header=True)
print(df_lf.head())
   acc_frontal  acc_vertical  acc_lateral  rssi  phase  frequency  activity  \
0         0.43          0.61         0.53  0.49   0.43       1.00      0.00   
1         0.43          0.61         0.53  0.16   0.10       0.45      0.00   
2         0.48          0.59         0.55  0.49   0.61       0.82      0.00   
3         0.48          0.59         0.55  0.43   0.75       0.64      0.00   
4         0.48          0.59         0.55  0.51   0.95       0.73      0.00   

   sex_b  
0      0  
1      0  
2      0  
3      0  
4      0  
In [29]:
# Class balance after removing the Isolation Forest anomalies.
sns.countplot(x ='sex_b', data = df_lf)
fig = px.histogram(df_lf, x="sex_b", color="sex_b")
fig.update_layout(barmode='group')
fig.show()
In [30]:
# NOTE(review): imports belong in the top imports cell, not mid-notebook.
import h2o
from h2o.automl import H2OAutoML

h2o.init()
# NOTE(review): no random_state on this split, so the train/test partition
# (and the leaderboard below) is not reproducible run-to-run.
df_train, df_test = train_test_split(df_lf, test_size=0.26)
df_train.to_csv (r'train_lf.csv', index = False, header=True)
df_test.to_csv (r'test_lf.csv', index = False, header=True)
# Import a sample binary outcome train/test set into H2O

train = h2o.import_file("train_lf.csv")
test = h2o.import_file("test_lf.csv")

# Identify predictors and response
x = train.columns
y = "sex_b"
x.remove(y)

# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  
Checking whether there is an H2O instance running at http://localhost:54321 . connected.
H2O_cluster_uptime: 54 mins 05 secs
H2O_cluster_timezone: Europe/Dublin
H2O_data_parsing_timezone: UTC
H2O_cluster_version: 3.32.0.2
H2O_cluster_version_age: 28 days, 23 hours and 7 minutes
H2O_cluster_name: H2O_from_python_mishraanurag218_c9hyux
H2O_cluster_total_nodes: 1
H2O_cluster_free_memory: 3.094 Gb
H2O_cluster_total_cores: 12
H2O_cluster_allowed_cores: 12
H2O_cluster_status: locked, healthy
H2O_connection_url: http://localhost:54321
H2O_connection_proxy: {"http": null, "https": null}
H2O_internal_security: False
H2O_API_Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
Python_version: 3.8.3 final
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%
model_id auc logloss aucpr mean_per_class_error rmse mse
StackedEnsemble_AllModels_AutoML_20201216_144602 0.9982260.05836690.997783 0.019583 0.1225050.0150074
StackedEnsemble_BestOfFamily_AutoML_20201216_1446020.99822 0.05817460.99777 0.01960640.1224170.014986
DRF_1_AutoML_20201216_144602 0.9980030.06630540.997435 0.02283730.1295340.0167791
XRT_1_AutoML_20201216_144602 0.9979820.06609320.99746 0.02226850.1302530.0169658
GBM_4_AutoML_20201216_144602 0.9977280.05341360.997125 0.01961270.1191360.0141935
GBM_3_AutoML_20201216_144602 0.9975070.05511190.996825 0.019431 0.1198540.0143651
GBM_5_AutoML_20201216_144602 0.9973250.05979540.996651 0.02102610.1260560.0158901
GBM_2_AutoML_20201216_144602 0.9969230.06336420.996033 0.021548 0.1278950.0163572
XGBoost_grid__1_AutoML_20201216_144602_model_4 0.9964480.06643140.995464 0.02253660.1303680.0169957
XGBoost_1_AutoML_20201216_144602 0.9960620.07119850.994986 0.02457240.1360120.0184993
GBM_1_AutoML_20201216_144602 0.9960590.07258140.994628 0.02255010.1353890.0183303
XGBoost_3_AutoML_20201216_144602 0.9956550.075643 0.994385 0.02617340.14003 0.0196083
XGBoost_2_AutoML_20201216_144602 0.99563 0.07635040.994488 0.02545690.1412660.0199562
XGBoost_grid__1_AutoML_20201216_144602_model_2 0.9955750.07639360.994316 0.02545460.1410310.0198899
GBM_grid__1_AutoML_20201216_144602_model_2 0.9950890.08354920.993387 0.02944680.1500350.0225104
XGBoost_grid__1_AutoML_20201216_144602_model_3 0.9946250.08726590.993178 0.02871070.1509760.0227939
GBM_grid__1_AutoML_20201216_144602_model_1 0.9927290.108788 0.990574 0.03637990.1696010.0287646
XGBoost_grid__1_AutoML_20201216_144602_model_1 0.9926670.104157 0.990563 0.03559990.1667190.0277952
DeepLearning_1_AutoML_20201216_144602 0.9733570.205503 0.963157 0.08000320.2466190.060821
DeepLearning_grid__1_AutoML_20201216_144602_model_10.9658690.259087 0.950183 0.08302780.2676 0.0716096
DeepLearning_grid__2_AutoML_20201216_144602_model_10.9653870.248089 0.952324 0.101005 0.2715590.0737444
GLM_1_AutoML_20201216_144602 0.8691610.510126 0.742909 0.178882 0.3999980.159999
Out[30]:

Local Outlier Factor

In [31]:
# NOTE(review): this cell plots the Isolation Forest scores (if_scores) but
# sits under the "Local Outlier Factor" heading — it probably belongs with
# the Isolation Forest section above.
plt.figure(figsize=(12,8))
plt.hist(if_scores);
plt.title('Histogram of Avg Anomaly Scores: Lower => More Anomalous');

LOF uses density-based outlier detection to identify local outliers, points that are outliers with respect to their local neighborhood, rather than with respect to the global data distribution. The higher the LOF value for an observation, the more anomalous the observation.

This is useful because not all methods will identify a point that’s an outlier relative to a nearby cluster of points (a local outlier) if that whole region is not an outlying region in the global space of data points.

A point is labeled as an outlier if the density around that point is significantly different from the density around its neighbors.

In [32]:
# Local Outlier Factor on the SMOTE-resampled features.
# NOTE(review): `clf` is reused for a third different model in this notebook
# (LazyClassifier, IsolationForest, now LOF) — distinct names would avoid
# hidden-state mistakes on partial re-runs.
clf = LocalOutlierFactor(n_neighbors=30, contamination=.1)
y_pred = clf.fit_predict(df_X)
LOF_Scores = clf.negative_outlier_factor_
# Map -1/1 (sklearn) to 1 = anomaly, 0 = normal.
LOF_pred=pd.Series(y_pred).replace([-1,1],[1,0])
# NOTE(review): LOF_pred has one entry per *resampled* row of df_X, yet it is
# used here to index `df` (the smaller original frame).  This appears to rely
# on pandas aligning the shared 0..n index labels and ignoring the synthetic
# rows — confirm this is intended and supported by the pinned pandas version.
LOF_anomalies=df[LOF_pred==1]
In [33]:
LOF_anomalies  # original-frame rows flagged as local outliers
Out[33]:
time acc_frontal acc_vertical acc_lateral id rssi phase frequency activity device_id sex sex_b
2 1.00 0.33 0.96 0.08 1 -55.50 3.85 924.75 1 d1p28 F 0
4 1.50 0.33 0.96 0.08 4 -55.00 5.99 924.25 1 d1p28 F 0
9 3.25 0.33 0.96 0.08 1 -56.00 3.65 925.25 1 d1p28 F 0
13 5.50 0.17 1.02 0.01 4 -59.50 1.47 922.25 1 d1p28 F 0
14 5.75 0.17 1.02 0.01 1 -58.00 0.61 924.75 1 d1p28 F 0
... ... ... ... ... ... ... ... ... ... ... ... ...
75114 745.75 -0.13 0.33 -1.15 3 -65.50 5.90 925.25 3 d2p15 F 0
75115 746.75 0.30 0.89 -0.31 3 -59.50 3.98 925.75 1 d2p15 F 0
75116 747.00 0.30 0.89 -0.31 3 -58.00 1.77 920.75 1 d2p15 F 0
75117 747.25 0.75 0.70 -0.36 3 -60.00 5.37 921.25 1 d2p15 F 0
75118 748.00 0.51 0.82 -0.12 3 -60.50 2.48 925.25 1 d2p15 F 0

7800 rows × 12 columns

In [34]:
# Background: all observations in white; foreground: LOF anomalies in red.
# The original indexed columns positionally (dfX.iloc[:,0] / dfX.iloc[:,2]),
# which plotted acc_frontal vs acc_lateral from the *scaled* frame while the
# red overlay and the axis labels referred to time vs acc_vertical from the
# unscaled frame.  Selecting columns by name from `df` makes both layers and
# the labels consistent.  (The unused `cmap = np.array(['white','red'])`
# assignment, which shadowed the Set1 colormap from the DBSCAN cell, has
# been removed.)
plt.scatter(df['time'], df['acc_vertical'], c='white', s=20, edgecolor='k')
plt.scatter(LOF_anomalies['time'], LOF_anomalies['acc_vertical'], c='red')
plt.title('Local Outlier Factor — Anomalies')
plt.xlabel('time')
plt.ylabel('acc_vertical')
Out[34]:
Text(0, 0.5, 'acc_vertical')
In [35]:
df_X  # SMOTE-resampled feature matrix
Out[35]:
acc_frontal acc_vertical acc_lateral rssi phase frequency activity
0 0.43 0.61 0.53 0.49 0.43 1.00 0.00
1 0.43 0.61 0.53 0.16 0.10 0.45 0.00
2 0.48 0.59 0.55 0.49 0.61 0.82 0.00
3 0.48 0.59 0.55 0.43 0.75 0.64 0.00
4 0.48 0.59 0.55 0.51 0.95 0.73 0.00
... ... ... ... ... ... ... ...
87651 0.78 0.24 0.46 0.42 0.17 1.00 0.67
87652 0.37 0.59 0.58 0.42 0.84 0.64 0.33
87653 0.87 0.18 0.50 0.40 0.87 0.64 0.67
87654 0.83 0.20 0.48 0.46 0.19 0.27 0.67
87655 0.93 0.13 0.55 0.28 0.07 0.27 0.67

87656 rows × 7 columns

In [36]:
# Interactive view of the resampled data coloured by LOF anomaly flag.
fig = px.scatter(df_X, x="phase", y="acc_vertical", color=LOF_pred,
                 hover_data=['phase'])
fig.show()
In [37]:
# Split the resampled data into LOF-clean rows and the flagged outliers,
# then show the outliers interactively.
lof_outlier_mask = LOF_pred == 1
df_X_lof = df_X[~lof_outlier_mask]
df_Y_lof = df_Y[~lof_outlier_mask]
df_out = pd.concat([df_X[lof_outlier_mask], df_Y[lof_outlier_mask]], axis=1, sort=False)
fig = px.scatter(df_out, x="phase", y="acc_vertical",
                 hover_data=['phase'])
fig.show()
In [38]:
# Persist the LOF-filtered data for later reuse.
df_lof = pd.concat([df_X_lof,df_Y_lof], axis=1, sort=False)
df_lof.to_csv (r'Filtered_lof.csv', index = False, header=True)
print(df_lof.head())
   acc_frontal  acc_vertical  acc_lateral  rssi  phase  frequency  activity  \
0         0.43          0.61         0.53  0.49   0.43       1.00      0.00   
1         0.43          0.61         0.53  0.16   0.10       0.45      0.00   
3         0.48          0.59         0.55  0.43   0.75       0.64      0.00   
5         0.48          0.59         0.55  0.46   0.69       0.73      0.00   
6         0.48          0.59         0.55  0.31   0.01       0.27      0.00   

   sex_b  
0      0  
1      0  
3      0  
5      0  
6      0  
In [39]:
# Class balance after removing the LOF outliers.
sns.countplot(x ='sex_b', data = df_lof)
fig = px.histogram(df_lof, x="sex_b", color="sex_b")
fig.update_layout(barmode='group')
fig.show()
In [40]:
# Re-run the classifier benchmark on the LOF-filtered data.
X_train, X_test, y_train, y_test = train_test_split(df_X_lof, df_Y_lof,test_size=.33,random_state =123)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models
 90%|█████████ | 27/30 [02:45<00:41, 13.79s/it]
[15:19:58] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
100%|██████████| 30/30 [02:46<00:00,  5.56s/it]
Out[40]:
Accuracy Balanced Accuracy ROC AUC F1 Score Time Taken
Model
ExtraTreesClassifier 0.98 0.98 0.98 0.98 3.24
RandomForestClassifier 0.98 0.98 0.98 0.98 5.27
BaggingClassifier 0.98 0.98 0.98 0.98 1.57
DecisionTreeClassifier 0.98 0.98 0.98 0.98 0.29
XGBClassifier 0.97 0.97 0.97 0.97 1.39
KNeighborsClassifier 0.97 0.97 0.97 0.97 1.49
ExtraTreeClassifier 0.96 0.96 0.96 0.96 0.09
LGBMClassifier 0.96 0.96 0.96 0.96 0.34
SVC 0.93 0.93 0.93 0.93 38.79
AdaBoostClassifier 0.89 0.89 0.89 0.89 1.80
NuSVC 0.85 0.85 0.85 0.85 96.71
LogisticRegression 0.75 0.75 0.75 0.74 0.14
CalibratedClassifierCV 0.73 0.73 0.73 0.72 11.00
QuadraticDiscriminantAnalysis 0.73 0.73 0.73 0.71 0.08
RidgeClassifierCV 0.72 0.72 0.72 0.70 0.08
LinearSVC 0.72 0.72 0.72 0.70 3.45
LinearDiscriminantAnalysis 0.72 0.72 0.72 0.70 0.11
RidgeClassifier 0.72 0.72 0.72 0.70 0.08
GaussianNB 0.71 0.71 0.71 0.69 0.08
SGDClassifier 0.71 0.71 0.71 0.69 0.17
BernoulliNB 0.70 0.70 0.70 0.68 0.08
NearestCentroid 0.69 0.69 0.69 0.68 0.07
Perceptron 0.65 0.65 0.65 0.64 0.11
PassiveAggressiveClassifier 0.56 0.56 0.56 0.56 0.13
CheckingClassifier 0.50 0.50 0.50 0.33 0.06
DummyClassifier 0.50 0.50 0.50 0.50 0.07